This project is about Crime in Philadelphia
# Notebook-only shell magic: installs folium into the running kernel.
!pip install folium
import pandas as pd
import numpy as np
import folium
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn import linear_model
import os
from folium import IFrame
from folium.plugins import MarkerCluster
import seaborn as sb
import matplotlib.dates as mdates
import datetime
import warnings
# Silence all warnings for cleaner notebook output.
# NOTE(review): this also hides pandas SettingWithCopy/deprecation warnings —
# consider narrowing the filter while developing.
warnings.filterwarnings("ignore")
This data is collected from Kaggle, https://www.kaggle.com/mchirico/philadelphiacrimedata. The data is in a CSV format in a file named crime.csv. After that it is loaded into a pandas dataframe to further process and visualize the data.
This data contains information about crime in Philadelphia from 2005 to 2017.
# Column names for the raw Kaggle CSV; everything is read as str and the
# datetime columns are parsed explicitly afterwards.
crime_columns = ['Dc_Dist', 'Psa', 'Dispatch_Date_Time', 'Dispatch_Date',
                 'Dispatch_Time', 'Hour', 'Dc_Key', 'Location_Block', 'UCR_General',
                 'Crime_Type', 'Police_Districts', 'Month', 'Longitude',
                 'Latitude']
# Load the data into a dataframe.
df = pd.read_csv("crime.csv", header=0, names=crime_columns,
                 dtype={col: str for col in crime_columns})
# Vectorized datetime parsing with an explicit format: pd.to_datetime is much
# faster than the deprecated date_parser= hook calling strptime once per row.
df['Dispatch_Date_Time'] = pd.to_datetime(df['Dispatch_Date_Time'],
                                          format='%Y-%m-%d %H:%M:%S')
# Fix Month ('YYYY-MM' strings) to a datetime column, also vectorized
# instead of the per-row .apply(strptime) version.
df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')
df.head()
Remove all unnecessary columns from the table and tidy any other column that needs tidying.
# Drop the columns that are unused in the analysis — a single drop(columns=...)
# call instead of five sequential drops.
df = df.drop(columns=['Location_Block', 'Dc_Key', 'Dc_Dist',
                      'Dispatch_Date_Time', 'Hour'])
# Derive Year and Day, keep the full month-date as Crime_Date, and reduce the
# existing Month column to the month number.
df["Year"] = pd.DatetimeIndex(df["Month"]).year
df["Day"] = pd.DatetimeIndex(df["Dispatch_Date"]).day
df["Crime_Date"] = df["Month"]
df["Month"] = pd.DatetimeIndex(df["Crime_Date"]).month
# Since this dataset is very large, dropping all NaN rows cleans it without
# introducing meaningful bias.
df2 = df.dropna()
# 2017 is an incomplete year, so exclude it to avoid a misleading partial count.
df2 = df2[df2.Year != 2017]
# Rebuild a contiguous 0..n-1 index after the row filters
# (idiomatic replacement for assigning range(len(...)) to .index).
df2 = df2.reset_index(drop=True)
df2.head()
At this point, the data is clean and ready for analysis. This part includes visualization of the data based on different factors like year, month, and crime type. It also includes a map to better visualize crime in Philadelphia. In the end it has statistical measurements to understand how the parameters vary.
A plot of the total number of crimes committed in a given Year from 2006 to 2016 (2017 is excluded during cleaning because it is incomplete).
Here as you can see in the graph below, the total number of crimes in Philadelphia seem to be decreasing with years.
# Countplot of total crimes per year (df2 already excludes the partial 2017).
sb.catplot(data=df2, x="Year", kind="count", height=6, aspect=2)
plt.title("Number of Crimes commited per year", fontsize=16)
plt.xlabel("Year", fontsize=16)
plt.ylabel("Total Crime Number", fontsize=14)
A plot of the total number of crimes committed in a given Month from 2006 to 2016. Here as you can see, the trend in crimes per Month doesn't look linear like the previous (Year) graph. Here it seems like more crimes happen in the middle of the year, compared to the start and end.
# Countplot of total crimes per calendar month, aggregated across all years.
sb.catplot(data=df2, x='Month', kind='count', height=6, aspect=2)
plt.title("Number of Crimes commited per Month", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.ylabel("Total Crime Number", fontsize=14)
There are 33 different types of crimes, and the following graph visualizes the frequency of each type to better understand how crime rates in Philadelphia vary across different types.
Here it can be seen that the numbers of thefts, vandalism, drug violations, frauds and burglaries are higher than the numbers of Arsons, Rapes, Criminal Homicides, Public Drunkenness and Embezzlements.
# Horizontal countplot of crime types, ordered from most to least frequent.
type_order = df2.Crime_Type.value_counts().index
sb.catplot(data=df2, y='Crime_Type', kind='count', height=8, aspect=2,
           order=type_order)
plt.title("Number of Times a Specific type of Crime was Commited", fontsize=16)
plt.xlabel("Number of Crimes", fontsize=16)
plt.ylabel("Type of Crime", fontsize=14)
This graph shows the number of crimes per Police District in Philadelphia, it is ordered by the crime count.
From the graph below it can be seen that the number of crimes committed per police district has large disparities, with district 11 having the highest number and district 22 having the smallest number.
# Countplot of crimes per police district, ordered by crime count (highest first).
district_order = df2.Police_Districts.value_counts().index
sb.catplot(data=df2, x='Police_Districts', kind='count', height=6, aspect=2,
           order=district_order)
plt.title("Number of Crimes commited per Police District", fontsize=16)
plt.xlabel("Police District", fontsize=16)
plt.ylabel("Number of Crimes", fontsize=14)
This next plot groups the data by Month AND Day, which means that it shows the average number of crimes committed on a given day of the year. Using this graph it can be seen whether there are any specific days in the year when crime is unusually high or low.
From the below graph it can be seen that the number of Crimes in Philadelphia are surprisingly low on Holidays like Valentine's Day, Independence Day, Thanksgiving week (Since Thanksgiving doesn't have a specific date, the average of that entire week is low) and Christmas.
# Average number of crimes committed on each calendar day (Month, Day),
# averaged over the 11 full years (2006-2016) of the CLEANED data.
# Grouping df2 (not df) matters: df still contains the partial 2017 year and
# NaN rows, which would bias an "average over 11 years".
data_by_day_and_month = df2.groupby(["Month", "Day"]).size() / 11
# February 29th only exists in 3 of the 11 years, so rescale its average to
# divide by 3 instead of 11. Assign through .loc with the full MultiIndex key:
# chained indexing ([2][29]) writes into a temporary copy and the correction
# would be silently lost (the warning is hidden by filterwarnings above).
data_by_day_and_month.loc[(2, 29)] = (data_by_day_and_month.loc[(2, 29)] * 11) / 3
# Make sure that the xticks in the following graph always fall on the first of
# a month: cumulative day offsets over a leap year.
leap_year = [31,29,31,30,31,30,31,31,30,31,30,31]
ticks = []
n = 0
for days in leap_year:
    ticks.append(n)
    n += days
plot = data_by_day_and_month.plot(figsize=(16,7), xticks=ticks, color="blue")
plot.set(xlabel="(Month, Day)", ylabel="Average Number of Crimes")
plot.set_title("Numbers of Crimes per day in Philadelphia", fontweight="bold", fontsize=16)
# Annotate holidays where the crime count is noticeably low.
# Valentine's Day
plt.arrow(15,435,20,20, width=1, color="black", head_starts_at_zero=False)
plt.text(15,420, "Valentine's Day")
# Independence Day
plt.arrow(158,435,20,20, width=1, color="black", head_starts_at_zero=False)
plt.text(134,420, "Independence Day")
# Thanksgiving
plt.arrow(295,420,20,20, width=1, color="black", head_starts_at_zero=False)
plt.text(275,402, "Thanksgiving Week")
# Christmas
plt.arrow(330,220,20,0, width=1, color="black", head_starts_at_zero=False)
plt.text(305,218, "Christmas")
This section uses the map of Philadelphia to show the trends in the crime data, for a better understanding of where in the city most of the crimes are concentrated.
We have included three maps:
Note that the reason for using different years is mainly because folium maps don't fetch for a dataset larger than a certain number
This map below shows the heatmap of crimes that happened in 2016. Here the Black color represents dangerous crimes, blue color represents thefts and cyan color represents other smaller crimes.
It is evident (just as expected) that number of dangerous crimes is less than thefts which is less than small crimes.
This graph uses a random sample of size 42,000 because for values higher than that the map doesn't render and using a random sample takes care of any bias that might occur when using a sample of the population.
from folium.plugins import HeatMap
from folium import plugins
from folium import FeatureGroup
from folium import IFrame
from folium.plugins import MarkerCluster
from random import randint
# Crime types treated as dangerous (black) and as thefts (blue); everything
# else is drawn cyan.
dangerous = ['Weapon Violations', 'Robbery Firearm', 'Homicide - Criminal', 'Aggravated Assault Firearm',
             'Homicide - Gross Negligence', 'Homicide - Justifiable', 'Rape']
theft = ['Thefts', 'Theft from Vehicle', 'Motor Vehicle Theft', 'Receiving Stolen Property',
         'Recovered Stolen Motor Vehicle']
map_osm = folium.Map(location=[39.95,-75.16], zoom_start=11)
arrest_loc = FeatureGroup(name="Crime")
# Filter to 2016 FIRST, then take the 42,000-row sample. The original order
# (sample, then boolean-mask with df2.Year) kept only the ~1/11 of sampled
# rows that happened to fall in 2016, far fewer than the intended 42,000.
temp_data = df2[df2.Year == 2016].sample(n=42000)
for i, row in temp_data.iterrows():
    if row['Crime_Type'] in dangerous:
        color = 'black'
    elif row['Crime_Type'] in theft:
        color = 'blue'
    else:
        color = 'cyan'
    # Latitude/Longitude were read as strings; folium needs numeric coordinates.
    point = [float(row['Latitude']), float(row['Longitude'])]
    arrest_loc.add_child(folium.Circle(radius=30, location=point, color=color, fill=True))
map_osm.add_child(arrest_loc)
map_osm.add_child(folium.map.LayerControl())
map_osm
This graph visualizes a heat map of the Violent crimes commited between 2011 and 2016 included. Here violent crimes include Weapon Violations, Robbery Firearm, Homicides, Aggravated Assaults and Rapes.
This graph shows where dangerous crimes happen most often
We have restricted the years from 2011 to 2016 because a large dataset causes problems with the Map.
# Heatmap of the dangerous (violent) crimes from 2011 through 2016.
map_osm2 = folium.Map(location=[39.95,-75.16], zoom_start=11)
# creating a new dataframe with all the dangerous crimes in it
dangerous_data = df2[df2['Crime_Type'].isin(dangerous)]
# Build the [lat, lon] pairs for the heatmap. Coordinates were read as
# strings, so cast to float; astype(...).values.tolist() replaces the slow
# per-row iterrows loop with a single vectorized conversion.
data_heatmap = dangerous_data[dangerous_data.Year > 2010]
data_heatmap = data_heatmap[['Latitude','Longitude']].astype(float).values.tolist()
HeatMap(data_heatmap, radius=10).add_to(map_osm2)
map_osm2
This graph visualizes a heat map of the thefts commited between 2011 and 2016 included. Here thefts include Thefts, Theft from Vehicle, Motor Vehicle Theft, Receiving Stolen Property, Recovered Stolen Motor Vehicle.
This graph shows where theft crimes happen most often.
We have restricted the years from 2015 to 2016 because a large dataset causes problems with fetching the Map. But the sample here is large enough to give us a good idea of the actual data.
# Heatmap of theft crimes from 2015 through 2016, on a 70% random sample.
map_osm3 = folium.Map(location=[39.95,-75.16], zoom_start=11)
# creating a new dataframe with all the theft crimes in it
theft_data = df2[df2['Crime_Type'].isin(theft)]
# Same vectorized [lat, lon] construction as the dangerous-crime heatmap:
# cast the string coordinates to float instead of looping with iterrows.
data_heatmap = theft_data[theft_data.Year > 2014].sample(frac=0.7)
data_heatmap = data_heatmap[['Latitude','Longitude']].astype(float).values.tolist()
HeatMap(data_heatmap, radius=10).add_to(map_osm3)
map_osm3
Regression by Year and crime count.
# Count the number of crimes in each year. df2 has no NaNs (dropna above),
# so groupby('Year').size() equals the original count of UCR_General per
# year, without counting every column of the frame first.
# reset_index(name='Count') names the column directly, replacing the
# rename(index=str, ...) step.
count_by_year = df2.groupby('Year').size().reset_index(name='Count')
count_by_year
# Scatter of the yearly crime totals with a degree-1 least-squares trend line.
years = count_by_year['Year'].values
counts = count_by_year['Count'].values
coeffs = np.polyfit(x=years, y=counts, deg=1)
trend = np.poly1d(coeffs)
# Dense x grid so the fitted line is drawn smoothly across the year range.
xs = np.linspace(years.min(), years.max(), 100)
plt.figure(figsize=(15,10))
plt.plot(years, counts, 'o', xs, trend(xs))
plt.xlabel("year")
plt.ylabel("number of crimes")
plt.title("Crimes in Philadelphia")
#linear regression 1
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as sm
count_year = count_by_year[['Year', 'Count']].sort_values(by=['Year'], ascending=True).reset_index(drop=True)
# np.matrix is deprecated (NumPy recommends plain ndarrays). sklearn expects
# X as a 2-D (n_samples, n_features) array and y as a 1-D array, so build
# those directly instead of slicing a matrix.
x_value = count_year['Year'].values.reshape(-1, 1)
y_value = count_year['Count'].values
line = LinearRegression().fit(x_value, y_value)
m = line.coef_[0]          # slope: change in crime count per year
b_value = line.intercept_  # intercept of the fitted line
print ("y = {0}x + {1}".format(m, b_value))
x_data = count_year['Year'].values
y_data = count_year['Count'].values
minimum = x_data.min()
maximum = x_data.max()
# Refit with the statsmodels formula API to get the full OLS summary
# (coefficients, p-values, R^2, ...).
result = sm.ols(formula="Count ~ Year", data=count_year).fit()
print (result.summary())
x1 = np.linspace(minimum, maximum, 100)
y1 = x1*m+b_value
#Based on the regression line equation, on average, crime count decreases by -5840.33636364 every year.
#now we add another factor to our regression(crime type)
# Count crimes per crime type. As with the per-year table, on NaN-free df2
# groupby().size() equals the original per-column count of UCR_General, and
# reset_index(name='Count') names the column without a rename step.
count_crime_type = df2.groupby('Crime_Type').size().reset_index(name='Count')
count_crime_type
#linear regression 2
#another regression based on Year and Crime Type
# Count of crimes for every (Year, Crime_Type) pair. reset_index(name='Count')
# names the size column directly, replacing the copy + rename + drop(0, 1)
# sequence; drop(0, 1) relied on a positional axis argument that was removed
# in pandas 2.0.
crime_type_year = (
    df2[['Year', 'Crime_Type']]
    .groupby(['Year', 'Crime_Type'])
    .size()
    .reset_index(name='Count')
)
#Fit the second regression: Year and Crime_Type main effects plus their interaction
regression2 = sm.ols(formula='Count ~ Year + Crime_Type + Year * Crime_Type', data=crime_type_year).fit()
regression2.summary()